Setting Parameters:

Things to keep in mind: (1) Make sure that all plates to be combined have the same number of “_”-separated fields in their folder names; these fields are used to set up the phenodata columns. (2) The combined ID per plate is used for labelling in figures.

## Parameters for processing dataset and metadata ##
barcode_file <- "../scrna/barcode_384.tab"
MT_genes_file <- "../scrna/genomes/genome_addons/MT_genes.txt"
# Substring replacement applied to the plate/cell names (see below):
old_col_pattern <- "_S"
new_col_pattern <- "-S"
# Variables encoded in the plate names ("_"-separated fields):
plate_variables <- c("Genome", "Spikein", "Reporter", "Method", "Lineage",
                     "Timepoint", "Replicate", "Library", "Well")
# Fields combined into one unique ID per plate, for visualization purposes:
combined_variables_to_id <- c("Method", "Lineage", "Timepoint", "Library")

## Filtering of the dataset ##
# Gene-level settings: a gene is kept when expressed with at least
# `gene_tresh` counts in at least `amount_cells_expr` cells.
gene_tresh <- 1
amount_cells_expr <- 2
# Cell-level settings: minimum total counts / detected features per cell,
# and maximum allowed spike-in and mitochondrial percentages.
total_counts_tresh <- 1000
total_feat_tresh <- 500
ERCC_pct_max <- 20
mt_pct_max <- 50

## Subsetting the dataset ##
# (Sub)string used to select cells on (selection happens on the colnames).
subset_id <- ""
# Type of filtering: keep cells carrying the substring ("in"), remove them
# ("out"), or apply no filtering at all ("" or "no").
filtering <- "no"

## Seurat Normalization, HVG selection (vst) & Scaling (and Regression) ##
nHVG <- 2000
# Variables regressed out during scaling; set to NULL for no regression.
vars_to_regress <- c("nCount_sf")

## Dimensionality reduction ##
# Number of principal components computed by PCA.
pcs_max <- 70
# PC counts used for the different UMAP representations.
pcs_for_overview <- c(5, 10, 20, 25, 30, 32, 35, 37, 40, 50)

## Visualization ##
# Phenodata column used to colour Scater and Seurat plots.
lab_col <- "Library"
umap_col <- "Method"
umap_col2 <- "Timepoint"
# Variables checked for the variability they explain (confounder check).
confounders_to_test <- c("Library", "Lineage", "Timepoint")
# Marker genes shown in the violin plots.
explore_violin <- c("NKX2-5", "MYL7")

## Storing results ##
workdir <- "../snabel/scrna/CMdiff_mon-EB-EHT_combined/rerun_v4/preprocess_R/"
# If regression is performed, that is appended to the folder name later on.
result_descript <- "_results_complete"

## Location of scripts ##
source("../R-scripts/scRNA-seq/read_kb_dataset_s2s.R")
source("../R-scripts/scRNA-seq/qc_umis_384plot.R")
source("../R-scripts/scRNA-seq/qc_ercc_384plot.R")
# dir.create() is portable and needs no shell; recursive = TRUE mimics
# `mkdir -p`, and showWarnings = FALSE silences the "already exists" warning.
dir.create(workdir, recursive = TRUE, showWarnings = FALSE)
knitr::opts_knit$set(root.dir = normalizePath(workdir))
# Fixed seed for reproducibility of any stochastic steps downstream (UMAP, PCA init).
set.seed(42)

Cleaning up the counts table, for subset:


Creating the SingleCellExperiment object

The counts table is loaded along with the metadata of the cells into a Scater-usable object. Scater will be used to inspect the quality of the data and to help with filtering out bad cells or genes.

Location of the file:

## Spliced (exonic) counts: read all kallisto|bustools plate folders into one table,
## using the 384-well barcode whitelist to name the cells.
spliced.data.df = read_kb_counts("../../results/kallistobus/", "spliced", barcode_file = barcode_file)
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d10-1-942"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d12-1-944"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d14-1-18791"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d14-1-946"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d14-2-18771"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d21-1-948"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d5-1-18778"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d5-1-936"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d7-1-938"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d8-1-18769"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d8-1-940"
## [1] "Reading: GRCh38_ERCC_reporter-EB-shared-d4-1-18765"
## [1] "Reading: GRCh38_ERCC_reporter-EB-shared-d4-1-934"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d10-1-941"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d12-1-943"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d14-1-945"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d14-1-Z45_S7"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d14-2-18770"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d18-1-Z48_S6"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d18-2-18784"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d21-1-947"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d5-1-18766"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d5-1-935"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d7-1-937"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d8-1-18768"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d8-1-939"
## [1] "Reading: GRCh38_ERCC_reporter-EHT_AM_d12_15ul_Z14_S4"
## [1] "Reading: GRCh38_ERCC_reporter-EHT_AM_d12_5ul_Z12_S2"
## [1] "Reading: GRCh38_ERCC_reporter-EHT_VM_d12_15ul_Z13_S3"
## [1] "Reading: GRCh38_ERCC_reporter-EHT_VM_d12_5ul_Z11_S1"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-AM-m12-1-Z46_S4"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-AM-m12-2-18776"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-AM-m5-1-18773"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-AM-m5-2-18782"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-VM-m11-1-Z43_S1"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-VM-m11-2-18788"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-VM-m5-1-18781"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-VM-m5-2-18772"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-AM-m12-1-18786"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-AM-m12-2-18774"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-AM-m5-1-Z41_S3"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-AM-m5-2-18783"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d10-1-18779"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d10-1-820"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d11-1-821"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d11-1-822"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d12-1-824"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d13-1-826"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d14-1-828"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d14-1-Z42_S8"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d14-2-18780"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d4-1-18764"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d5-1-18777"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d7-1-813"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d7-1-814"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d8-1-18767"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d8-1-815"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d8-1-816"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d9-1-817"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-m11-1-Z44_S2"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-m11-2-18775"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-m5-1-Z47_S5"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-m5-2-18785"
# Unspliced (intronic) counts, read with the same barcode whitelist as the spliced table.
unspliced.data.df = read_kb_counts("../../results/kallistobus/", "unspliced", barcode_file = barcode_file)
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d10-1-942"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d12-1-944"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d14-1-18791"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d14-1-946"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d14-2-18771"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d21-1-948"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d5-1-18778"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d5-1-936"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d7-1-938"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d8-1-18769"
## [1] "Reading: GRCh38_ERCC_reporter-EB-AM-d8-1-940"
## [1] "Reading: GRCh38_ERCC_reporter-EB-shared-d4-1-18765"
## [1] "Reading: GRCh38_ERCC_reporter-EB-shared-d4-1-934"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d10-1-941"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d12-1-943"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d14-1-945"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d14-1-Z45_S7"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d14-2-18770"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d18-1-Z48_S6"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d18-2-18784"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d21-1-947"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d5-1-18766"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d5-1-935"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d7-1-937"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d8-1-18768"
## [1] "Reading: GRCh38_ERCC_reporter-EB-VM-d8-1-939"
## [1] "Reading: GRCh38_ERCC_reporter-EHT_AM_d12_15ul_Z14_S4"
## [1] "Reading: GRCh38_ERCC_reporter-EHT_AM_d12_5ul_Z12_S2"
## [1] "Reading: GRCh38_ERCC_reporter-EHT_VM_d12_15ul_Z13_S3"
## [1] "Reading: GRCh38_ERCC_reporter-EHT_VM_d12_5ul_Z11_S1"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-AM-m12-1-Z46_S4"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-AM-m12-2-18776"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-AM-m5-1-18773"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-AM-m5-2-18782"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-VM-m11-1-Z43_S1"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-VM-m11-2-18788"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-VM-m5-1-18781"
## [1] "Reading: GRCh38_ERCC_reporter-EHT-VM-m5-2-18772"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-AM-m12-1-18786"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-AM-m12-2-18774"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-AM-m5-1-Z41_S3"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-AM-m5-2-18783"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d10-1-18779"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d10-1-820"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d11-1-821"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d11-1-822"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d12-1-824"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d13-1-826"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d14-1-828"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d14-1-Z42_S8"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d14-2-18780"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d4-1-18764"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d5-1-18777"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d7-1-813"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d7-1-814"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d8-1-18767"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d8-1-815"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d8-1-816"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-d9-1-817"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-m11-1-Z44_S2"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-m11-2-18775"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-m5-1-Z47_S5"
## [1] "Reading: GRCh38_ERCC_reporter-Mon-VM-m5-2-18785"
## Optional edits on cell names:
# Only runs when a substring needing replacement was defined in the parameters
# (old_col_pattern). `fixed = TRUE` treats the pattern as a literal substring —
# which is what the parameter is documented to be — instead of a regular
# expression, guarding against metacharacters in plate names.
if (old_col_pattern != ""){
  colnames(spliced.data.df) <- gsub(old_col_pattern, new_col_pattern, colnames(spliced.data.df), fixed = TRUE)
  colnames(unspliced.data.df) <- gsub(old_col_pattern, new_col_pattern, colnames(unspliced.data.df), fixed = TRUE)
}

Setting up results directory

# If regression will be performed, record that in the results folder name.
if (!is.null(vars_to_regress)) {
  result_descript <- paste(result_descript, "regressed", sep = "_")
}

# Date stamp (YYYYMMDD) so repeated runs land in separate folders.
dateoftoday <- gsub("-", "", as.character(Sys.Date()))
resultsdir <- paste0(workdir, dateoftoday, result_descript)
# Portable replacement for `system("mkdir -p ...")`; no shell dependency.
dir.create(resultsdir, recursive = TRUE, showWarnings = FALSE)
knitr::opts_knit$set(root.dir = normalizePath(resultsdir))

Results will be stored in the directory given by `resultsdir` (the working directory plus a date stamp and the result description).

# Fraction of all reads that are unspliced (sanity check on the run).
sum(unspliced.data.df)/(sum(spliced.data.df)+sum(unspliced.data.df))
## [1] 0.3698399
# unspliced.data.df <- as.matrix(unspliced.data.df)
# spliced.data.df <- as.matrix(spliced.data.df)
# Restrict both matrices to the cells they have in common, in identical order,
# so spliced/unspliced columns line up one-to-one.

all_cells <- intersect(colnames(spliced.data.df),colnames(unspliced.data.df))

unspliced.data.df <- unspliced.data.df[,all_cells]
spliced.data.df <- spliced.data.df[,all_cells]

# Verify the alignment (should print TRUE).
identical(colnames(spliced.data.df),colnames(unspliced.data.df))
## [1] TRUE
# The default data.df will be the spliced dataset (shorter to type)
data.df <- spliced.data.df

Perform subsetting of dataset (optional)

# Optional subsetting on a substring of the cell names (subset_id).
# grepl() already returns a logical vector, so the original `== TRUE`
# comparisons were redundant; the unprinted length() checks were dead code
# (values inside an if-block are not auto-printed) and have been removed.
if (filtering == "out"){
  # Keep only the cells whose name does NOT contain the substring.
  data.df <- data.df[, !grepl(subset_id, colnames(data.df))]
} else if (filtering == "in"){
  # Keep only the cells whose name DOES contain the substring.
  data.df <- data.df[, grepl(subset_id, colnames(data.df))]
} else {
  print(paste0("No filtering applied. The amount of cells in the dataset remain: ", as.character(length(colnames(data.df)))))
}
## [1] "No filtering applied. The amount of cells in the dataset remain: 24188"
spliced.data.df <- data.df
# Keep the unspliced matrix in sync with whatever subset survived above.
subset_cells <- intersect(colnames(spliced.data.df), colnames(unspliced.data.df))
unspliced.data.df <- unspliced.data.df[,subset_cells]

# Number of plates (384 wells per plate):
length(colnames(spliced.data.df))/384
## [1] 62.98958
## Setting up the phenotable ##
# One row per cell; the cell name itself encodes the metadata fields.
phenodata <- data.frame(row.names=colnames(data.df))
phenodata$names <- row.names(phenodata)
# Split each cell name into the fields declared in plate_variables ("_"-separated).
phenodata <- separate(phenodata, col = "names", into = plate_variables, sep = "_")

## Combined per-plate label built from the columns in combined_variables_to_id.
phenodata$combined_id <- apply(phenodata[,combined_variables_to_id], 1, paste, collapse = "_")

# Only take the entries that are matchable with the counttable entries:
pheno_matched <- phenodata[rownames(phenodata) %in% colnames(data.df),]

# Matching phenodata with the dataset ordering
pheno_ordered <- pheno_matched[match(colnames(data.df),rownames(pheno_matched)),]
identical(rownames(pheno_ordered), colnames(spliced.data.df))
## [1] TRUE
# NOTE(review): the file name says "kbordered" but the *unordered* phenodata
# object is written, not pheno_ordered — confirm which one was intended.
write.csv(phenodata, "phenodata_kbordered.csv")

Plate Overviews

Running QC over the plates: are there certain spatial patterns (e.g. edge, row, or column effects) visible on the 384-well layout?

## Running plate QC: are there certain patterns?

# Make a list of cell-names compatible with the excel file: plate#_A1, plate#_A2 etc.
plate_order <- read.table(barcode_file, sep = "\t", col.names = c("well","barcode"))

# Make a vector with all plate numbers: strip the trailing well field
# (the run of non-"_" characters at the end), keeping the trailing "_".
platenrs <- unique(gsub("([^_]*)$", "", colnames(data.df)))
pdf("PlateDiag_lndscp.pdf", paper = "USr")
# settings for the plate diagnostics pdf 
par(mfrow=c(2,2), mar = c(5,4,4,2) + 0.1, cex.main = 1)

# Iterate over all plates, order cells in the order of visualization
for (plate in platenrs){
  # use the order of cells from the barcode file (this is A1, A2, A3, etc to P24)
  primer_order <- paste(plate, plate_order$well, sep="")
  
  # if wells are missing on the plate, these need to be added and given a value of 0
  missing_wells <- primer_order[!primer_order %in% colnames(spliced.data.df)]
  cols_to_add <- data.frame(matrix(ncol = length(missing_wells), nrow = length(rownames(spliced.data.df))))
  colnames(cols_to_add) <- missing_wells
  cols_to_add[is.na(cols_to_add)] <- 0
  # NOTE(review): `plate` is used here as a (regex) pattern; a plate name that
  # is a prefix of another plate name would match both — verify names are unique.
  diag_plate <- cbind(spliced.data.df[,grep(plate, colnames(spliced.data.df))], cols_to_add)
  # phenodata contains same cellid entry + rowname as used in dataset
  cells_order <- colnames(diag_plate[,match(primer_order, colnames(diag_plate))])
  
  # match dataset cells order with wells in the visualization
  tmp <- as.matrix(diag_plate[,cells_order])
  QC_umis_384plot(tmp, paste(plate, "UMI_QC", sep = "_"))
  # ERCC spike-in rows only, for the spike-in plate overview.
  QC_ERCC_384plot(tmp[grep("^ERCC", rownames(diag_plate)),], paste(plate, "ERCC_QC", sep = "_"))
  
  rm(tmp)
}
dev.off()
## png 
##   2

Creating a SingleCellExperiment object for confounder check

# df -> matrix + phenodata -> SCE 
# Build the SingleCellExperiment from the (spliced) counts and the ordered
# phenodata; pheno_ordered rows line up with the count-matrix columns.
count_matrix <- as.matrix(data.df)
sce <- SingleCellExperiment(assays = list(counts = count_matrix), colData = pheno_ordered, rowData = rownames(count_matrix))
## character(0)
# Snapshot of the unfiltered object, for later re-use.
saveRDS(sce, file = "unfiltered_spliced_counts_sce.rds")

Cleaning the expression matrix

Setting thresholds for the removal of genes too lowly expressed in too few cells.

# Mitochondrial gene identifiers (first column of the annotation file).
MT_genes <- read.table(MT_genes_file)[,1]

# Per-cell totals on the full (unfiltered) count matrix.
sce$nCounts <- colSums(counts(sce))
# Number of detected genes per cell. colSums() on the logical matrix gives
# the identical result to the original apply()-per-column loop, vectorized.
sce$nGene <- colSums(counts(sce) > 0)

# Adding spike-in and MT count QC & removes ERCC and MT genes:
# splitAltExps moves the flagged rows into alternative experiments, so the
# main assay keeps only endogenous genes afterwards.
is.spike <- grepl("^ERCC-", rownames(sce))
sce <- splitAltExps(sce, ifelse(is.spike, "ERCC", "gene"))

is.MT <- rownames(sce) %in% MT_genes
sce <- splitAltExps(sce, ifelse(is.MT, "MT", "gene"))

# Calculate the quality metrics (adds altexps_*_percent columns used below):
sce <- addPerCellQCMetrics(sce)

Distribution of counts per cell in the dataset

Manually setting arbitrary count tresholds for the cells considered healthy.

# Looking at the total number of RNA molecules per sample
# UMI counts were used for this experiment
hist(sce$nCounts, breaks = 100)
# Red line marks the per-cell minimum-counts threshold chosen in the parameters.
abline(v = total_counts_tresh, col = "red")

# Looking at the amount of unique genes per sample
# This is the amount with ERCC included.
hist(sce$nGene, breaks = 100)
# Red line marks the per-cell minimum-features threshold.
abline(v= total_feat_tresh, col = "red")

Histogram showing the total amounts of counts (x-axis) per proportion of cells (each bar). Red line at: 1000 counts. Histogram showing the total amounts of genes (features) per proportion of cells. Red line at: 500 genes.

Plotting spike-in data

Spike-ins and mitochondrial expression are used as another measure of cell quality. An overrepresentation of spike-in or mitochondrial transcripts might indicate an “unhealthy” cell or a poor library. These plots show the percentage of spike-ins against the total amount of reads found in each cell.

A higher percentage of spike-in reads indicates a lower amount of endogenous transcripts in the cell or, in the case of mitochondrial genes, a cell that may have been apoptotic. Note that smaller cells will have relatively more spike-in-allocated reads, and some cell types may have higher numbers of mitochondria — both are important to consider when setting this boundary.

# Removal of cells causing a warning (cells with zero totals yield NaN
# percentages). is.nan() is the proper test here; the original compared the
# numeric vector against the *string* "NaN", which only worked through
# implicit character coercion.
NaN_cells <- unique(c(colnames(sce)[is.nan(sce$altexps_ERCC_percent)],
                      colnames(sce)[is.nan(sce$altexps_MT_percent)]))
sce <- sce[,!colnames(counts(sce)) %in% NaN_cells]

# Using Scater to plot percentages of spikes. `colour_by` is spelled out in
# full; the original `colour =` relied on partial argument matching.
plotColData(sce,
            x = "nGene",
            y = "altexps_MT_percent", colour_by = lab_col)

plotColData(sce,
            x = "nGene",
            y = "altexps_ERCC_percent", colour_by = lab_col)

# Per-library overview of the four QC metrics in one panel.
multiplot(
  plotColData(sce, y="nCounts", x="Library"),
  plotColData(sce, y="nGene", x="Library"),
  plotColData(sce, y="altexps_MT_percent", x="Library"),
  plotColData(sce, y="altexps_ERCC_percent", x="Library"),
  cols=2)
## Warning: 'multiplot' is deprecated.
## Use 'gridExtra::grid.arrange' instead.
## See help("Deprecated")

Plotting the percentages of the spike-ins against the total amount of genes, each dot represents a cell. Color labelled on Library.

#———————— ## Filtering the cells ## #————————

Using manual thresholds for filtering out the outliers in the dataset.

#---------------------------
## Manually set thresholds for filtering of the cells:
#---------------------------
# Per-cell logical masks, one per QC criterion (TRUE = cell passes),
# using the thresholds shown above in the histograms.
filter_by_expr_features <- sce$nGene >= total_feat_tresh
filter_by_total_counts  <- sce$nCounts >= total_counts_tresh
filter_by_ercc          <- sce$altexps_ERCC_percent < ERCC_pct_max
filter_by_mt            <- sce$altexps_MT_percent < mt_pct_max

# A cell is kept only when it passes all four criteria.
sce$use <- filter_by_expr_features & filter_by_total_counts &
  filter_by_ercc & filter_by_mt

# Amount of cells removed per filtering criterion (FALSE = would be removed):
table(filter_by_expr_features)
## filter_by_expr_features
## FALSE  TRUE 
## 10855 13332
table(filter_by_total_counts)
## filter_by_total_counts
## FALSE  TRUE 
## 10744 13443
table(filter_by_ercc)
## filter_by_ercc
## FALSE  TRUE 
##  9715 14472
table(filter_by_mt)
## filter_by_mt
## FALSE  TRUE 
##   692 23495
# Result of manual filtering with the set thresholds combined;
# TRUE are considered healthy cells:
table(sce$use)
## 
## FALSE  TRUE 
## 11481 12706
# The quality check-passing cells are stored in the SCE-object in $use selection of the counts table. 
# Create the quality-checked dataset (cells only; genes filtered below):
sce_qc <- sce[, colData(sce)$use]
dim(sce_qc)
## [1] 30590 12706

#———————— ## Filtering the genes ## #————————

#---------------------------
# UMIs were used to collapse the reads of the same transcript.
# A gene is considered expressed when it has >= gene_tresh counts
# in at least amount_cells_expr cells:
keep_feature <- rowSums(counts(sce_qc) >= gene_tresh) >= amount_cells_expr

# Create the quality-checked dataset:
sce_qc <- sce_qc[keep_feature,]

# Number of genes retained (sum of a logical vector counts the TRUEs).
genes_expressed <- sum(keep_feature)

# BUG FIX: the original call passed only the file name as the data argument,
# so the *string* "spliced_qc_counts.tsv" was printed to stdout and no file
# was written. Write the filtered count matrix to the intended TSV instead.
write.table(as.matrix(counts(sce_qc)), file = "spliced_qc_counts.tsv",
            sep = "\t", col.names = NA, quote = FALSE)
saveRDS(sce_qc, file = "filtered_spliced_counts_sce.rds")

Plotting the distributions of the dataset before and after filtering.

# Four histograms on one page: counts/genes per cell, before (sce) and
# after (sce_qc) cell filtering; red lines mark the thresholds used.
pdf("Histograms_before+aftercellsFiltering.pdf")
par(mfrow=c(2,2))
hist(sce$nCounts, breaks = 100)
abline(v = total_counts_tresh, col = "red")

hist(sce$nGene, breaks = 100)
abline(v= total_feat_tresh, col = "red")

hist(sce_qc$nCounts, breaks = 100)
abline(v = total_counts_tresh, col = "red")

hist(sce_qc$nGene, breaks = 100)
abline(v= total_feat_tresh, col = "red")
dev.off()
## png 
##   2
pdf("MT+ERCC_before+aftercellsFiltering.pdf")
# NOTE: par(mfrow) only affects base graphics; plotColData returns ggplot
# objects, so each plot lands on its own PDF page regardless.
par(mfrow=c(2,2))
# MT and ERCC percentages vs. detected genes, before (sce) and after (sce_qc)
# filtering. `colour_by` spelled out in full (the original `colour =` relied
# on partial argument matching against plotColData's `colour_by`).
plotColData(sce,
            x = "nGene",
            y = "altexps_MT_percent", colour_by = lab_col)

plotColData(sce,
            x = "nGene",
            y = "altexps_ERCC_percent", colour_by = lab_col)
plotColData(sce_qc,
            x = "nGene",
            y = "altexps_MT_percent", colour_by = lab_col)

plotColData(sce_qc,
            x = "nGene",
            y = "altexps_ERCC_percent", colour_by = lab_col)
dev.off()
## png 
##   2

In the dataset 20799 are considered expressed.

# The fraction of reads consumed by the top 50 most-expressed genes
# (a check for library complexity / dominance by a few genes):
plotHighestExprs(sce_qc)

Summary of gene filtering: genes were kept only if at least 2 cells expressed them with at least 1 count. For this dataset, 9791 genes were removed and 20799 genes were kept.

Check for confounding factors

PCA on only the endogenous genes is used to evaluate the influence of the confounding factors.

# The PCA per default uses the top 500 most variable genes of the dataset, changeable with the argument: ntop = 500.
# Plotting the raw data without any transformation (baseline for comparison
# with the log/normalized PCAs below).
sce_qc <- runPCA(
  sce_qc,
  ncomponents = 50,
  exprs_values = "counts" 
)
plotReducedDim(sce_qc, dimred = "PCA",   
               colour_by = lab_col,
               size_by = "nGene")

# The PCA data is stored in the reducedDimNames as a "PCA_coldata" entry, if use_coldata = TRUE in runPCA(). If use_coldata = FALSE, this will be stored in "PCA". 
reducedDimNames(sce_qc)
## [1] "PCA"

Raw log2-transformation

To compare with other normalization methods.

# Simple log2(x + 1) transform stored as an extra assay, to compare against
# proper normalization methods later on.
assay(sce_qc, "logcounts_raw") <- log2(counts(sce_qc) + 1)

# plotReducedDim and plotPCA will do the same, with plotPCA you leave out the use_dimred="PCA" argument.
# `tmp` is throwaway: we only want the plot, not to overwrite sce_qc's PCA.
tmp <- runPCA(sce_qc, ncomponents = 50, exprs_values = "logcounts_raw")
# plot PCA after log2 transformation
plotPCA(tmp, 
        colour_by = lab_col,
        size_by = "nGene")

# One can also run tSNE in similar ways with Scater.
rm(tmp)
# The logcounts_raw is not enough to account for the technical factors between the cells.
# The logcounts_raw is not enough to account for the technical factors between the cells.

Normalization in Seurat

Make the Seurat object, ‘seuset’. In this step the cells could be filtered again; however, they have already been filtered above in the table clean-up, where only genes expressed (at least 1 count) in at least 2 cells were kept.

# Carry the QC'ed SCE metadata over into the Seurat object.
metadata <- data.frame(colData(sce_qc))
# Assay is named "sf" (spliced, filtered); hence nCount_sf / nFeature_sf below.
seuset <- CreateSeuratObject(counts = counts(sce_qc), assay = "sf", meta.data = metadata)
## Warning: The following arguments are not used: row.names
FeatureScatter(
    object = seuset, 
    feature1 = "nCount_sf", 
    feature2 = "nFeature_sf"
)

# Seurat normalization: "a global-scaling normalization method LogNormalize that normalizes the gene expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result.""
# Keep an un-normalized copy: `seu` is re-used further below when the
# unspliced assay is added and both assays are normalized together.
seu <- seuset
seuset <- NormalizeData(
    object = seuset, 
    normalization.method = "LogNormalize", 
    scale.factor = 10000
)
# Looking into the dataset: per-group distributions of features, counts and
# the marker genes selected in the parameters.
VlnPlot(
    object = seuset, 
    features = c("nFeature_sf"), 
    group.by = lab_col
)

VlnPlot(
    object = seuset, 
    features = c("nCount_sf"), 
    group.by = lab_col
)

VlnPlot(
  object = seuset, 
    features = explore_violin, 
    group.by = lab_col
)

FeatureScatter(
    object = seuset, 
    feature1 = "nCount_sf", 
    feature2 = "nFeature_sf"
)

# Snapshot of the QC'ed + normalized Seurat object.
saveRDS(seuset, "seuset_qc+norm.rds")

Check confounders before & after normalization

# Only take the entries that are matchable with the counttable entries:
# NOTE(review): direct @assays$sf@data slot access ties this to the
# Seurat v3/v4 Assay layout — presumably fine here, but GetAssayData()
# would be the stable accessor; confirm the installed Seurat version.
filtered_cells <- intersect(rownames(pheno_ordered), colnames(seuset@assays$sf@data))
pheno_matchedseuset <- phenodata[filtered_cells,]
pheno_orderedseuset <- pheno_matchedseuset[match(colnames(seuset@assays$sf@data),rownames(pheno_matchedseuset)),]

count_matrixseuset <- as.matrix(seuset@assays$sf@data)

# NOTE(review): pheno_orderedseuset is computed above but the SCE below uses
# colData(sce_qc) instead — verify which phenodata was intended here.
sce_seunorm <- SingleCellExperiment(assays = list(logcounts = count_matrixseuset), colData = colData(sce_qc), rowData = rownames(count_matrixseuset))

Identifying the variation caused by each confounding factor

Before & after normalization

# Variables whose explained variance is checked: the configured confounders
# plus the two per-cell QC covariates.
explanatory_variables <- as.character(c(confounders_to_test, "nGene", "nCounts"))

# The plots below run a PCA on the data object and check to what extent the
# supplied variables explain the variance.
# The percentage of variance explained by each variable of interest:

# Fixed colour per explanatory variable so the before/after plots share the
# same colour coding. setNames() replaces the original index-tracking
# for-loop and produces the identical named character vector.
colourvector <- setNames(
  brewer.pal(length(explanatory_variables), "Dark2"),
  explanatory_variables
)

# Building combined plot, before and after normalization.
# Before: variance explained on the raw counts of the QC'ed object.
variance_sce_qc <- getVarianceExplained(sce_qc, variables = explanatory_variables, exprs_values = "counts")
# After: same on the Seurat-normalized logcounts.
variance_sce_seunorm <- getVarianceExplained(sce_seunorm, variables = explanatory_variables)
p1 <- plotExplanatoryVariables(variance_sce_qc) + expand_limits(y = 1) + scale_color_manual(values = colourvector) + ggtitle("Explanatory Variables Before Normalization")
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
p2 <- plotExplanatoryVariables(variance_sce_seunorm) + expand_limits(y = 1) + scale_color_manual(values = colourvector) + ggtitle("Explanatory Variables After Normalization")
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
multiplot(p1, p2)
## Warning: 'multiplot' is deprecated.
## Use 'gridExtra::grid.arrange' instead.
## See help("Deprecated")
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 2 rows containing non-finite values (stat_density).

# Running PCA on the normalized counts.
sce_seunorm <- runPCA(
  sce_seunorm, ncomponents = 20,
  exprs_values = "logcounts" 
)
# Plotting again the PCAs on raw-log-transformed and normalized values.
# Raw log-transformation (tmp is throwaway, plot only):
tmp <- runPCA(sce_qc, ncomponents = 50, exprs_values = "logcounts_raw")
# PCA plot after log2 transformation
plotPCA(tmp, 
        colour_by = lab_col,
        size_by = "nGene")

# PCA plot after Seurat normalization
plotPCA(sce_seunorm,
        colour_by = lab_col,
        size_by = "nGene")

Build unspliced assay

Select the same cells and genes as in the spliced dataset

# df -> matrix -> SCE + phenodata 
# Record the cells and genes that survived the spliced QC, so the unspliced
# data can be matched to exactly the same selection.
cells_use <- colnames(sce_qc)
genes_use <- rownames(sce_qc)

# NOTE(review): colData uses pheno_matched (not pheno_ordered); both derive
# row order from colnames(data.df), so the order should match — confirm.
sce_us <- SingleCellExperiment(assays = list(counts = as.matrix(unspliced.data.df)), colData = pheno_matched, rowData = rownames(unspliced.data.df))

# Per-cell totals and detected-gene counts for the unspliced matrix.
sce_us$nCounts <- colSums(counts(sce_us))
sce_us$nGene <- apply(counts(sce_us), 2, function(x) length(x[x>0]))

# Adding spike-in and MT counts (moves those rows to altExps, mirroring the
# spliced object so genes_use indexes the same main-assay rows):
is.spike <- grepl("^ERCC-", rownames(sce_us))
sce_us <- splitAltExps(sce_us, ifelse(is.spike, "ERCC", "gene"))

is.MT <- rownames(sce_us) %in% MT_genes
sce_us <- splitAltExps(sce_us, ifelse(is.MT, "MT", "gene"))

# Dataset restricted to the cells/genes kept by the spliced QC:
sce_usmatch <- sce_us[genes_use,cells_use]

# Calculate the quality metrics:
sce_us <- addPerCellQCMetrics(sce_us)
sce_usmatch <- addPerCellQCMetrics(sce_usmatch)

# Arbitrary thresholds (same red lines as for the spliced data, for comparison):
# Looking at the total number of RNA molecules per sample
# UMI counts were used for this experiment
hist(sce_us$nCounts, breaks = 100)
abline(v = total_counts_tresh, col = "red")

# Looking at the amount of unique genes per sample
# This is the amount with ERCC included.
hist(sce_us$nGene, breaks = 100)
abline(v= total_feat_tresh, col = "red")

hist(sce_usmatch$nCounts, breaks = 100)
abline(v = total_counts_tresh, col = "red")

hist(sce_usmatch$nGene, breaks = 100)
abline(v= total_feat_tresh, col = "red")

# Same four histograms, written to PDF.
pdf("Histograms_before+aftercellsFiltering_UnsplicedReads.pdf")
par(mfrow=c(2,2))
hist(sce_us$nCounts, breaks = 100)
abline(v = total_counts_tresh, col = "red")
hist(sce_us$nGene, breaks = 100)
abline(v= total_feat_tresh, col = "red")
hist(sce_usmatch$nCounts, breaks = 100)
abline(v = total_counts_tresh, col = "red")
hist(sce_usmatch$nGene, breaks = 100)
abline(v= total_feat_tresh, col = "red")
dev.off()
## png 
##   2

Build Seurat object with unspliced and spliced assay

# Subset the unspliced table to the exact cells/genes kept by the spliced QC.
unspliced_match <- unspliced.data.df[genes_use,cells_use]
unspliced_match <- as.matrix(unspliced_match)

# Add it to the *un-normalized* Seurat copy (`seu`, saved earlier) as a
# second assay: "uf" = unspliced, filtered.
seu[["uf"]] <- CreateAssayObject(counts = unspliced_match)

# LogNormalize both assays with the same scale factor.
seu <- NormalizeData(
    object = seu, assay = "sf",
    normalization.method = "LogNormalize", 
    scale.factor = 10000
)
seu <- NormalizeData(
    object = seu, assay = "uf",
    normalization.method = "LogNormalize", 
    scale.factor = 10000
)

Highly variable genes & Scaling of the gene expression values

# FindVariableFeatures plots the dispersion (= a normalized measure of cell-to-cell variation), as a function of average expression for each gene. 
# In their tutorial the Satija lab uses the cut-off of 2000 genes (nHVG parameter).
seu <- FindVariableFeatures(
    object = seu, assay = "sf",
    selection.method = "vst",
    nfeatures = nHVG)

seu <- FindVariableFeatures(
    object = seu, assay = "uf",
    selection.method = "vst",
    nfeatures = nHVG)

# Top 20 most variable genes per assay (used to label the HVG plots below):
top20 <- head(VariableFeatures(seu, assay = "sf"), 20)
top20_uf <- head(VariableFeatures(seu, assay = "uf"), 20)

# Plot variable features with labels (spliced assay):
plot1 <- VariableFeaturePlot(seu)
plot2 <- LabelPoints(plot = plot1, points = top20, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot2
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Same for the unspliced assay:
plot3 <- VariableFeaturePlot(seu, assay = "uf")
plot4 <- LabelPoints(plot = plot3, points = top20_uf, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot4
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 2126 rows containing missing values (geom_point).
## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Preferably removing the genes that are highly expressed but with a low variance.
length(x = seu@assays$sf@var.features)
## [1] 2000
seu[["sf"]]@var.features[1:10]
##  [1] "NPPB"     "DLK1"     "ACTA1"    "MYL2"     "AFP"      "SERPINE1"
##  [7] "CTGF"     "MGP"      "PENK"     "TFPI2"
# Scaling the data to make it usable for dimensional reduction 
# using all the genes, could also select only the highly variable genes. 
# NOTE(review): all.genes is taken from `seuset` while `seu` is scaled —
# both hold the same "sf" features, so this should be equivalent; confirm.
all.genes <- rownames(seuset)
seu <- ScaleData(
    object = seu,  vars.to.regress = vars_to_regress,
    assay = "sf",
    features = all.genes
)
## Regressing out nCount_sf
## Centering and scaling data matrix
seu <- ScaleData(
    object = seu,  vars.to.regress = vars_to_regress,
    assay = "uf",
    features = all.genes
)
## Regressing out nCount_sf
## Centering and scaling data matrix

Running PCA analysis on the scaled data

# From here on `seuset` is the fully scaled two-assay object; the temporary
# `seu` handle is dropped.
seuset <- seu
rm(seu)
# PCA is run on the spliced assay.
DefaultAssay(seuset) <- "sf"
seuset <- RunPCA(
    object = seuset,
    features = VariableFeatures(object = seuset), 
    npcs = pcs_max,
    ndims.print = 1:5, 
    nfeatures.print = 5
)
## PC_ 1 
## Positive:  TNNT2, ACTN2, SMPX, MYL7, ACTC1 
## Negative:  LIN28A, CNTNAP2, MARCKS, PRTG, GJA1 
## PC_ 2 
## Positive:  TTN, TNNI1, NEBL, COL2A1, MYL4 
## Negative:  COL3A1, LOX, FBN1, COL6A3, ETS1 
## PC_ 3 
## Positive:  TOP2A, CENPF, TPX2, DEPDC1, UBE2C 
## Negative:  MDK, VCAN, TGFB2, FN1, PKIA 
## PC_ 4 
## Positive:  CDH5, CD34, MYCT1, MMRN2, CD93 
## Negative:  COL3A1, COL1A1, COL1A2, COL6A3, TPM1 
## PC_ 5 
## Positive:  LINC00261, EPCAM, FOXA2, MAL2, CDH1 
## Negative:  CDH5, CD93, CD34, MMRN2, MYCT1
# Sanity check: both assays carry the expected number (nHVG) of HVGs.
length(seuset[["sf"]]@var.features)
## [1] 2000
length(seuset[["uf"]]@var.features)
## [1] 2000

Visualizing PCA results:

#PrintPCA(object = seuset.scnorm, pcs.print = 1:5, genes.print = 5, use.full = FALSE)

# Gene loadings per PC, shown on screen for the first 20 PCs...
VizDimLoadings(object = seuset, dims = 1:10, reduction = "pca")

VizDimLoadings(object = seuset, dims = 10:20, reduction = "pca")

# ...and written to PDF for all pcs_max PCs.
viz_pdf <- paste0("VizPCAplot_PCs1-", pcs_max, ".pdf")
pdf(viz_pdf, width = 20, height = 60)
VizDimLoadings(object = seuset, dims = 1:pcs_max, reduction = "pca")
dev.off()
## png 
##   2
# Cells in PC space, colored by the phenodata label configured in the header.
DimPlot(object = seuset, reduction = "pca", group.by = lab_col)

# Heatmaps of the extreme cells/genes per PC help in choosing how many PCs
# to carry into the downstream analysis.
DimHeatmap(object = seuset, dims = 1:5, cells = 500, balanced = TRUE)

heatmap_pdf <- paste0("PCheatmap_PCs1-", pcs_max, ".pdf")
pdf(heatmap_pdf, width = 20, height = 60)
DimHeatmap(object = seuset, dims = 1:pcs_max, cells = 500, balanced = TRUE)
dev.off()
## png 
##   2

Perform JackStraw Permutations to find significant PCs

# seuset.jack <- JackStraw(
#     object = seuset,
#     num.replicate = 100
# )
# seuset.jack <- ScoreJackStraw(seuset.jack, dims = 1:20)
# JackStrawPlot(object = seuset.jack, dims = 1:20)

Plotting Elbow plot to identify significant PCs

This plot displays the standard deviations of the principal components; the "elbow" where the curve flattens indicates how many PCs capture most of the meaningful variation.

ElbowPlot(object = seuset, ndims = 35)

Overview of different UMAPs with varying dimensional input

# Generating a combined plot per label: only the first panel keeps its legend,
# since the legend is identical across all panels.
plot.list <- list()
plot.list2 <- list()
label.vector <- c(umap_col, umap_col2)
j = 0  # NOTE(review): unused in this chunk; kept in case later code reads it

# For each PC count of interest, run UMAP once and store one panel per label.
# (The original incremented `i` inside the body; reassigning the loop variable
# of a `for` loop has no effect on iteration, so that dead code was removed.)
for (i in seq_along(pcs_for_overview)) {
  seuset <- RunUMAP(seuset, dims = 1:pcs_for_overview[i])
  dimnr <- as.character(pcs_for_overview[i])
  print(dimnr)
  p1 <- DimPlot(seuset, reduction = "umap", group.by = label.vector[1], combine = TRUE) + ggtitle(paste0("UMAP 1:", dimnr))
  p2 <- DimPlot(seuset, reduction = "umap", group.by = label.vector[2], combine = TRUE) + ggtitle(paste0("UMAP 1:", dimnr))
  if (i > 1) {
    # Legend only on the first panel; strip it from the rest.
    p1 <- p1 + theme(legend.position = "none")
    p2 <- p2 + theme(legend.position = "none")
  }
  plot.list[[dimnr]] <- p1
  plot.list2[[dimnr]] <- p2
}
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 17:39:05 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:39:05 Read 12706 rows and found 5 numeric columns
## 17:39:05 Using Annoy for neighbor search, n_neighbors = 30
## 17:39:05 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:39:07 Writing NN index file to temp file /scratch/snabel/tmp/r4rs/Rtmp3P3S68/file3d0e413f5a124c
## 17:39:07 Searching Annoy index using 1 thread, search_k = 3000
## 17:39:12 Annoy recall = 100%
## 17:39:13 Commencing smooth kNN distance calibration using 1 thread
## 17:39:14 Initializing from normalized Laplacian + noise
## 17:39:15 Commencing optimization for 200 epochs, with 472238 positive edges
## 17:39:20 Optimization finished
## [1] "5"
## 17:39:20 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:39:20 Read 12706 rows and found 10 numeric columns
## 17:39:20 Using Annoy for neighbor search, n_neighbors = 30
## 17:39:20 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:39:21 Writing NN index file to temp file /scratch/snabel/tmp/r4rs/Rtmp3P3S68/file3d0e41218ce22
## 17:39:21 Searching Annoy index using 1 thread, search_k = 3000
## 17:39:26 Annoy recall = 100%
## 17:39:26 Commencing smooth kNN distance calibration using 1 thread
## 17:39:28 Initializing from normalized Laplacian + noise
## 17:39:28 Commencing optimization for 200 epochs, with 513798 positive edges
## 17:39:33 Optimization finished
## [1] "10"
## 17:39:34 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:39:34 Read 12706 rows and found 20 numeric columns
## 17:39:34 Using Annoy for neighbor search, n_neighbors = 30
## 17:39:34 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:39:35 Writing NN index file to temp file /scratch/snabel/tmp/r4rs/Rtmp3P3S68/file3d0e4166538d30
## 17:39:35 Searching Annoy index using 1 thread, search_k = 3000
## 17:39:38 Annoy recall = 100%
## 17:39:39 Commencing smooth kNN distance calibration using 1 thread
## 17:39:40 Initializing from normalized Laplacian + noise
## 17:39:41 Commencing optimization for 200 epochs, with 528916 positive edges
## 17:39:47 Optimization finished
## [1] "20"
## 17:39:47 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:39:47 Read 12706 rows and found 25 numeric columns
## 17:39:47 Using Annoy for neighbor search, n_neighbors = 30
## 17:39:47 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:39:48 Writing NN index file to temp file /scratch/snabel/tmp/r4rs/Rtmp3P3S68/file3d0e416548fb0a
## 17:39:48 Searching Annoy index using 1 thread, search_k = 3000
## 17:39:51 Annoy recall = 100%
## 17:39:52 Commencing smooth kNN distance calibration using 1 thread
## 17:39:53 Initializing from normalized Laplacian + noise
## 17:39:54 Commencing optimization for 200 epochs, with 532022 positive edges
## 17:40:00 Optimization finished
## [1] "25"
## 17:40:00 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:40:00 Read 12706 rows and found 30 numeric columns
## 17:40:00 Using Annoy for neighbor search, n_neighbors = 30
## 17:40:00 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:40:01 Writing NN index file to temp file /scratch/snabel/tmp/r4rs/Rtmp3P3S68/file3d0e4139607dd1
## 17:40:01 Searching Annoy index using 1 thread, search_k = 3000
## 17:40:04 Annoy recall = 100%
## 17:40:05 Commencing smooth kNN distance calibration using 1 thread
## 17:40:06 Initializing from normalized Laplacian + noise
## 17:40:07 Commencing optimization for 200 epochs, with 535296 positive edges
## 17:40:12 Optimization finished
## [1] "30"
## 17:40:13 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:40:13 Read 12706 rows and found 32 numeric columns
## 17:40:13 Using Annoy for neighbor search, n_neighbors = 30
## 17:40:13 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:40:14 Writing NN index file to temp file /scratch/snabel/tmp/r4rs/Rtmp3P3S68/file3d0e415af924d8
## 17:40:14 Searching Annoy index using 1 thread, search_k = 3000
## 17:40:17 Annoy recall = 100%
## 17:40:18 Commencing smooth kNN distance calibration using 1 thread
## 17:40:19 Initializing from normalized Laplacian + noise
## 17:40:20 Commencing optimization for 200 epochs, with 535002 positive edges
## 17:40:25 Optimization finished
## [1] "32"
## 17:40:26 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:40:26 Read 12706 rows and found 35 numeric columns
## 17:40:26 Using Annoy for neighbor search, n_neighbors = 30
## 17:40:26 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:40:27 Writing NN index file to temp file /scratch/snabel/tmp/r4rs/Rtmp3P3S68/file3d0e41676d0780
## 17:40:27 Searching Annoy index using 1 thread, search_k = 3000
## 17:40:30 Annoy recall = 100%
## 17:40:31 Commencing smooth kNN distance calibration using 1 thread
## 17:40:32 Initializing from normalized Laplacian + noise
## 17:40:33 Commencing optimization for 200 epochs, with 536362 positive edges
## 17:40:38 Optimization finished
## [1] "35"
## 17:40:39 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:40:39 Read 12706 rows and found 37 numeric columns
## 17:40:39 Using Annoy for neighbor search, n_neighbors = 30
## 17:40:39 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:40:40 Writing NN index file to temp file /scratch/snabel/tmp/r4rs/Rtmp3P3S68/file3d0e411d605251
## 17:40:40 Searching Annoy index using 1 thread, search_k = 3000
## 17:40:43 Annoy recall = 100%
## 17:40:44 Commencing smooth kNN distance calibration using 1 thread
## 17:40:45 Initializing from normalized Laplacian + noise
## 17:40:46 Commencing optimization for 200 epochs, with 536938 positive edges
## 17:40:52 Optimization finished
## [1] "37"
## 17:40:52 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:40:52 Read 12706 rows and found 40 numeric columns
## 17:40:52 Using Annoy for neighbor search, n_neighbors = 30
## 17:40:52 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:40:53 Writing NN index file to temp file /scratch/snabel/tmp/r4rs/Rtmp3P3S68/file3d0e41222ee182
## 17:40:53 Searching Annoy index using 1 thread, search_k = 3000
## 17:40:56 Annoy recall = 100%
## 17:40:57 Commencing smooth kNN distance calibration using 1 thread
## 17:40:58 Initializing from normalized Laplacian + noise
## 17:40:59 Commencing optimization for 200 epochs, with 538494 positive edges
## 17:41:05 Optimization finished
## [1] "40"
## 17:41:05 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:41:05 Read 12706 rows and found 50 numeric columns
## 17:41:05 Using Annoy for neighbor search, n_neighbors = 30
## 17:41:05 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:41:06 Writing NN index file to temp file /scratch/snabel/tmp/r4rs/Rtmp3P3S68/file3d0e41152a6652
## 17:41:06 Searching Annoy index using 1 thread, search_k = 3000
## 17:41:09 Annoy recall = 100%
## 17:41:10 Commencing smooth kNN distance calibration using 1 thread
## 17:41:11 Initializing from normalized Laplacian + noise
## 17:41:12 Commencing optimization for 200 epochs, with 541936 positive edges
## 17:41:18 Optimization finished
## [1] "50"
# Save the per-label UMAP overview grids to PDF.
# NOTE: CombinePlots is deprecated in favour of patchwork, but is kept here to
# avoid changing dependencies.
# Fix: the layout argument forwarded to cowplot::plot_grid is `nrow`; the
# original passed `nrows`, which fell into `...` and was treated as an extra
# (numeric) plot — hence the captured "Cannot convert object of class numeric
# into a grob" warning and a broken grid layout.
n_rows <- round(length(pcs_for_overview) / 3)
pdf(paste0("UMAPdiffsettings_", paste(as.character(pcs_for_overview), collapse = "-"), label.vector[1], ".pdf"), width = 20, height = 15)
CombinePlots(plot.list, nrow = n_rows)
## Warning: CombinePlots is being deprecated. Plots should now be combined using
## the patchwork system.
dev.off()
## png 
##   2
pdf(paste0("UMAPdiffsettings_", paste(as.character(pcs_for_overview), collapse = "-"), label.vector[2], ".pdf"), width = 20, height = 15)
CombinePlots(plot.list2, nrow = n_rows)
## Warning: CombinePlots is being deprecated. Plots should now be combined using
## the patchwork system.
dev.off()
## png 
##   2

Based on the heatmaps and the elbow plot (and the JackStraw permutations, which also indicate these PCs are significant), the first 6 PCs can be used for further analysis.

# Saving the dataset with the normalized, scaled data and identified HVGs
# (accessible via VariableFeatures(seuset) / seuset[["sf"]]@var.features;
# the original comment referenced the Seurat-v2-era `@var.genes` slot).
saveRDS(seuset, file="seusetv3_scnormHVG_velocity.rds")
# Record package versions for reproducibility.
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /vol/mbconda/snabel/anaconda3/envs/kb_scrna_R_mon3/lib/libopenblasp-r0.3.17.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] Matrix_1.3-4                plot3D_1.4                 
##  [3] RColorBrewer_1.1-2          scran_1.20.1               
##  [5] SeuratObject_4.0.2          Seurat_4.0.4               
##  [7] scater_1.20.1               scuttle_1.2.1              
##  [9] SingleCellExperiment_1.14.1 SummarizedExperiment_1.22.0
## [11] Biobase_2.52.0              GenomicRanges_1.44.0       
## [13] GenomeInfoDb_1.28.4         IRanges_2.26.0             
## [15] S4Vectors_0.30.0            BiocGenerics_0.38.0        
## [17] MatrixGenerics_1.4.3        matrixStats_0.60.1         
## [19] knitr_1.33                  limma_3.48.3               
## [21] mvoutlier_2.1.1             sgeostat_1.0-27            
## [23] tidyr_1.1.3                 dplyr_1.0.7                
## [25] ggplot2_3.3.5               devtools_2.4.2             
## [27] usethis_2.0.1              
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2                reticulate_1.20          
##   [3] tidyselect_1.1.1          htmlwidgets_1.5.3        
##   [5] grid_4.1.0                BiocParallel_1.26.2      
##   [7] Rtsne_0.15                munsell_0.5.0            
##   [9] ScaledMatrix_1.0.0        codetools_0.2-18         
##  [11] ica_1.0-2                 statmod_1.4.36           
##  [13] future_1.22.1             miniUI_0.1.1.1           
##  [15] misc3d_0.9-0              withr_2.4.2              
##  [17] colorspace_2.0-2          highr_0.9                
##  [19] ROCR_1.0-11               robustbase_0.93-8        
##  [21] tensor_1.5                listenv_0.8.0            
##  [23] labeling_0.4.2            GenomeInfoDbData_1.2.6   
##  [25] polyclip_1.10-0           farver_2.1.0             
##  [27] rprojroot_2.0.2           parallelly_1.27.0        
##  [29] vctrs_0.3.8               generics_0.1.0           
##  [31] xfun_0.25                 R6_2.5.1                 
##  [33] ggbeeswarm_0.6.0          rsvd_1.0.5               
##  [35] locfit_1.5-9.4            bitops_1.0-7             
##  [37] spatstat.utils_2.2-0      cachem_1.0.6             
##  [39] DelayedArray_0.18.0       assertthat_0.2.1         
##  [41] promises_1.2.0.1          scales_1.1.1             
##  [43] beeswarm_0.4.0            gtable_0.3.0             
##  [45] beachmat_2.8.1            globals_0.14.0           
##  [47] processx_3.5.2            goftest_1.2-2            
##  [49] rlang_0.4.11              splines_4.1.0            
##  [51] lazyeval_0.2.2            spatstat.geom_2.2-2      
##  [53] yaml_2.2.1                reshape2_1.4.4           
##  [55] abind_1.4-5               httpuv_1.6.2             
##  [57] tcltk_4.1.0               tools_4.1.0              
##  [59] ellipsis_0.3.2            spatstat.core_2.3-0      
##  [61] jquerylib_0.1.4           sessioninfo_1.1.1        
##  [63] ggridges_0.5.3            Rcpp_1.0.7               
##  [65] plyr_1.8.6                sparseMatrixStats_1.4.2  
##  [67] zlibbioc_1.38.0           purrr_0.3.4              
##  [69] RCurl_1.98-1.4            ps_1.6.0                 
##  [71] prettyunits_1.1.1         rpart_4.1-15             
##  [73] deldir_0.2-10             pbapply_1.4-3            
##  [75] viridis_0.6.1             cowplot_1.1.1            
##  [77] zoo_1.8-9                 ggrepel_0.9.1            
##  [79] cluster_2.1.2             fs_1.5.0                 
##  [81] magrittr_2.0.1            RSpectra_0.16-0          
##  [83] data.table_1.14.0         scattermore_0.7          
##  [85] lmtest_0.9-38             RANN_2.6.1               
##  [87] fitdistrplus_1.1-5        pkgload_1.2.1            
##  [89] patchwork_1.1.1           mime_0.11                
##  [91] evaluate_0.14             xtable_1.8-4             
##  [93] gridExtra_2.3             testthat_3.0.4           
##  [95] compiler_4.1.0            tibble_3.1.4             
##  [97] KernSmooth_2.23-20        crayon_1.4.1             
##  [99] htmltools_0.5.2           mgcv_1.8-36              
## [101] later_1.3.0               DBI_1.1.1                
## [103] MASS_7.3-54               cli_3.0.1                
## [105] metapod_1.0.0             igraph_1.2.6             
## [107] pkgconfig_2.0.3           plotly_4.9.4.1           
## [109] spatstat.sparse_2.0-0     vipor_0.4.5              
## [111] bslib_0.3.0               dqrng_0.3.0              
## [113] XVector_0.32.0            stringr_1.4.0            
## [115] callr_3.7.0               digest_0.6.27            
## [117] sctransform_0.3.2         RcppAnnoy_0.0.19         
## [119] spatstat.data_2.1-0       rmarkdown_2.10           
## [121] leiden_0.3.9              uwot_0.1.10              
## [123] edgeR_3.34.1              DelayedMatrixStats_1.14.3
## [125] shiny_1.6.0               lifecycle_1.0.0          
## [127] nlme_3.1-152              jsonlite_1.7.2           
## [129] BiocNeighbors_1.10.0      desc_1.3.0               
## [131] viridisLite_0.4.0         fansi_0.5.0              
## [133] pillar_1.6.2              lattice_0.20-44          
## [135] fastmap_1.1.0             httr_1.4.2               
## [137] DEoptimR_1.0-9            pkgbuild_1.2.0           
## [139] survival_3.2-13           glue_1.4.2               
## [141] remotes_2.4.0             png_0.1-7                
## [143] bluster_1.2.1             stringi_1.7.4            
## [145] sass_0.4.0                BiocSingular_1.8.1       
## [147] memoise_2.0.0             irlba_2.3.3              
## [149] future.apply_1.8.1

Now use this file in the Velocyto.R dedicated Conda environment:

conda activate kb_scrna_velocyto2